Expanding the work. Find a publicly available data set and apply the same workflow and adapt some of the code to make it work.
To look for new scRNA-seq data the 10xgenomics homepage was used. The data-set contains Human peripheral blood mononuclear cells (PBMCs) of a healthy female donor (age: 25-30). For library generation ~16,000 cells (11,996 cells recovered) were sequenced on an Illumina NovaSeq 6000 to a read depth of approximately 40,000 mean reads per cell. The summary shows that more than 40,000 reads per cell and app. 2000 genes per cell (median) could be detected. The quality scores for sequencing and mapping show satisfactory results.
###################
#load libraries
###################
library(dplyr)
##
## Attache Paket: 'dplyr'
## Die folgenden Objekte sind maskiert von 'package:stats':
##
## filter, lag
## Die folgenden Objekte sind maskiert von 'package:base':
##
## intersect, setdiff, setequal, union
library(Seurat)
## Attaching SeuratObject
library(patchwork)
library(ggplot2)
############################
#Setup the Seurat Object
############################
# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "./data_new/002/")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc10k", min.cells = 3, min.features = 200)
pbmc
## An object of class Seurat
## 22302 features across 11922 samples within 1 assay
## Active assay: RNA (22302 features, 0 variable features)
Visualize QC metrics as a violin plot * The metrics showed a different distribution compared to the tutorial. The cutoff values will therefore have to be adjusted
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
Similar to the tutorial, there is a strong correlation between nCount and nFeature
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
Based on the violin plots, the following criteria are used: * 500 < nFeature_RNA < 5000 * percent.mt < 15 %
pbmc <- subset(pbmc, subset = nFeature_RNA > 500 & nFeature_RNA < 5000 & percent.mt < 15)
pbmc <- NormalizeData(pbmc)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1
## Positive: FGL2, FCN1, LYZ, CST3, CTSS, MNDA, TYMP, CYBB, SERPINA1, IFI30
## PSAP, NCF2, LST1, S100A9, TYROBP, AIF1, MPEG1, CSTA, TNFSF13B, FTL
## TMEM176B, DUSP6, CD68, FOS, GRN, MS4A6A, DUSP1, S100A8, AC020656.1, SPI1
## Negative: LTB, CD3E, IL32, CD3D, TRAC, CD3G, PCED1B-AS1, TRBC2, ETS1, BCL11B
## MALAT1, IL7R, ARL4C, CD2, LIME1, TCF7, CD7, LINC00861, PRKCQ-AS1, CD27
## PIK3IP1, CCR7, FCMR, CD247, TRBC1, GZMM, LEF1, NOSIP, ISG20, IKZF3
## PC_ 2
## Positive: IL32, CD3E, CD3D, GZMM, CD3G, CD2, ANXA1, CTSW, CD247, CD7
## BCL11B, TRAC, S100A4, GZMA, NKG7, CST7, LINC00861, PRF1, IL7R, TMSB4X
## KLRK1, GNLY, CCL5, ARL4C, KLRD1, S100A10, SAMD3, TRBC1, ID2, PRKCQ-AS1
## Negative: MS4A1, CD79A, IGHM, BANK1, CD79B, LINC00926, RALGPS2, TNFRSF13C, NIBAN3, IGHD
## BCL11A, SPIB, IGKC, CD22, HLA-DQA1, AFF3, LINC02397, PAX5, FCRL1, VPREB3
## BLK, FCER2, CD74, COBLL1, HLA-DRA, TCF4, BLNK, SWAP70, HLA-DRB1, GNG7
## PC_ 3
## Positive: CCR7, LEF1, IL7R, TCF7, PIK3IP1, MAL, RCAN3, TRABD2A, NOSIP, LTB
## CD27, CAMK4, OXNAD1, PRKCQ-AS1, PDE3B, TRAC, CHRM3-AS2, FHIT, LRRN3, BCL11B
## PASK, NELL2, S100A12, RGCC, TRAT1, CD3G, VCAN, ADTRP, S100A8, CD3D
## Negative: GZMB, NKG7, GNLY, KLRD1, CST7, PRF1, CLIC3, GZMA, FGFBP2, KLRF1
## CCL4, HOPX, SPON2, GZMH, CCL5, ADGRG1, C12orf75, PTGDR, CD160, FCGR3A
## TBX21, XCL2, CTSW, MATK, IL2RB, PLEK, TRDC, CTSC, LAIR2, S1PR5
## PC_ 4
## Positive: CD79B, MS4A1, FCGR3A, CD79A, LINC00926, GNLY, IGHD, TNFRSF13C, BANK1, KLRD1
## FCER2, NKG7, CD22, CCL4, FGFBP2, PRF1, FCRL1, CST7, PAX5, MTSS1
## FCRL3, RALGPS2, SWAP70, VPREB3, IFITM2, HOPX, KLRF1, GZMA, LINC02397, GZMH
## Negative: LILRA4, CLEC4C, SERPINF1, LRRC26, TPM2, SCT, MAP1A, DNASE1L3, EPHB1, LINC00996
## LAMP5, ITM2C, PTCRA, TNFRSF21, PACSIN1, CIB2, SMPD3, SCAMP5, PHEX, PLD4
## SCN9A, PLEKHD1, DERL3, IL3RA, ZFAT, NRP1, AC097375.1, SLC35F3, GAS6, SMIM5
## PC_ 5
## Positive: CDKN1C, HES4, TCF7L2, CTSL, BATF3, CSF1R, FCGR3A, MS4A7, CKB, SIGLEC10
## CASP5, RRAS, IFITM3, AC104809.2, MS4A4A, MTSS1, CALML4, NAP1L1, NEURL1, SMIM25
## AC064805.1, CAMK1, RHOC, ABI3, GPBAR1, HMOX1, ZNF703, RNASET2, FAM110A, HCAR3
## Negative: S100A12, SLC2A3, VCAN, ITGAM, CYP1B1, PADI4, S100A8, CES1, MEGF9, MGST1
## VNN2, QPCT, MCEMP1, THBS1, CD14, RBP7, CTSD, CR1, AC020916.1, BST1
## MARC1, CKAP4, PLBD1, ANXA1, F5, ALOX5AP, CRISPLD2, RNASE2, AC020656.1, HMGB2
# Examine and visualize PCA results a few different ways
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: FGL2, FCN1, LYZ, CST3, CTSS
## Negative: LTB, CD3E, IL32, CD3D, TRAC
## PC_ 2
## Positive: IL32, CD3E, CD3D, GZMM, CD3G
## Negative: MS4A1, CD79A, IGHM, BANK1, CD79B
## PC_ 3
## Positive: CCR7, LEF1, IL7R, TCF7, PIK3IP1
## Negative: GZMB, NKG7, GNLY, KLRD1, CST7
## PC_ 4
## Positive: CD79B, MS4A1, FCGR3A, CD79A, LINC00926
## Negative: LILRA4, CLEC4C, SERPINF1, LRRC26, TPM2
## PC_ 5
## Positive: CDKN1C, HES4, TCF7L2, CTSL, BATF3
## Negative: S100A12, SLC2A3, VCAN, ITGAM, CYP1B1
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
DimPlot(pbmc, reduction = "pca")
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(pbmc, dims = 1:9, cells = 500, balanced = TRUE)
Due to the larger dataset, more dimensions were required to capture the full variation of cells. The Jackstraw procedure and the elbow plot produce different results, and the elbow plot is difficult to interpret at a high number of PCs. Therefore, we chose to orient ourselves on the Jackstraw plot and “err on the high side”, and used 40 components for clustering.
pbmc <- JackStraw(pbmc, num.replicate = 100, dims = 50) # Takes a long time (~ 20 min on my PC)
pbmc <- ScoreJackStraw(pbmc, dims = 1:50)
JackStrawPlot(pbmc, dims = 1:40)
## Warning: Removed 60469 rows containing missing values (geom_point).
ElbowPlot(pbmc, ndims = 50)
# optionally save the current state of the data
#saveRDS(pbmc, file = "./pbmc_newdata.rds")
The resolution had to be adjusted to detect the different cell types and subtypes. We found that with a resolution of 0.1, the clusters represented basic cell types (T, B, Mono, etc.). To also capture the various subgroups of cells, a resolution of 1 was used. 21 different clusters were found, compared to 9 in the tutorial dataset.
pbmc <- FindNeighbors(pbmc, dims = 1:40)
## Computing nearest neighbor graph
## Computing SNN
pbmc <- FindClusters(pbmc, resolution = 1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 11482
## Number of edges: 465145
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8681
## Number of communities: 21
## Elapsed time: 1 seconds
# Look at cluster IDs of the first 5 cells
head(Idents(pbmc), 5)
## AAACCCAAGGCCCAAA-1 AAACCCAAGTAATACG-1 AAACCCAAGTCACACT-1 AAACCCACAAAGCGTG-1
## 10 0 5 3
## AAACCCACAATCGAAA-1
## 6
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
The same number of dimensions as in clustering had to be used here.
pbmc <- RunUMAP(pbmc, dims = 1:40)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 19:51:26 UMAP embedding parameters a = 0.9922 b = 1.112
## 19:51:26 Read 11482 rows and found 40 numeric columns
## 19:51:26 Using Annoy for neighbor search, n_neighbors = 30
## 19:51:26 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 19:51:28 Writing NN index file to temp file C:\Users\Daniel\AppData\Local\Temp\RtmpuQca4G\file4348208a76d7
## 19:51:28 Searching Annoy index using 1 thread, search_k = 3000
## 19:51:31 Annoy recall = 100%
## 19:51:31 Commencing smooth kNN distance calibration using 1 thread
## 19:51:32 Initializing from normalized Laplacian + noise
## 19:51:33 Commencing optimization for 200 epochs, with 502384 positive edges
## 19:51:46 Optimization finished
# `label = TRUE` to help identify the clusters
DimPlot(pbmc, reduction = "umap", label = TRUE)
#run TSNE
pbmc <- RunTSNE(pbmc, dims = 1:40)
DimPlot(pbmc, reduction = "tsne", label = TRUE)
Overall, tSNE results in a better separation of clusters in our opinion. Clusters that were not fully separated using umap (clusters 0, 4, 12) were separate here.
Find markers for every cluster compared to all remaining cells, report only the positive ones.
# find markers for every cluster compared to all remaining cells, report only the positive
# ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
## Calculating cluster 17
## Calculating cluster 18
## Calculating cluster 19
## Calculating cluster 20
pbmc.markers %>%
group_by(cluster) %>%
slice_max(n = 10, order_by = avg_log2FC) -> top10
top10
The ten most significant genes for each cluster are printed. By examining these genes, a first indication of the cell type making up some of the clusters can be gained.
0, 10, 12: Monocytes (CD14, FCGR3A) 1, 11: CD8+ T cells (CD8A, CD8B) 6, 7, 17, 19: B cells (IG…) 8: NK cells (NKG7) 18: Platelets (PPBP)
To conclusively determine the cell type of each cluster, we researched relevant marker genes for PBMCs. The following assignment is based on the last section of the Seurat tutorial as well as Wang et al., 2021 (https://doi.org/10.1038/s41467-021-25771-5)
First, we we determined the base cell type of each cluster. The following violin plots show the expression of at least one relevant marker gene for each type.
VlnPlot(pbmc, features = c("CD3D", "NCAM1", "CD19", "CD14", "FCGR3A", "CD1C", "LILRA4", "CD34", "PPBP"))
Next, we examined each cell type more closely to determine the subtypes of each cluster.
Monocytes are usually divided into two main group, CD14+ and CD16+ (FCGR3A), as well as an intermediate type.
VlnPlot(pbmc, features = c("CD14", "FCGR3A"), idents = c(0,4,5,10,12))
Three marker genes were examined for B cells. MS4A1 is expressed by naive and memory B cells, but not by plasma cells. CD27 is expressed by memory B cells and plasma cells and CD38 is only expressed by plasma cells.
VlnPlot(pbmc, features = c("MS4A1", "CD27", "CD38"), idents = c(6,7,17,19))
Six marker genes were considered for determine T cell subtypes. CD4 and CD8A are used to differentiate between the two main subgroups, CD4+ and CD8+ T cells. CCR7 and S100A4 are used to differentiate between naive (only CCR7), central memory (both CCR7 and S100A4) and effector memory cells (only S100A4). Additionally, FOXP3 is a marker gene for CD4+ regulatory T cells, while TRDC indicates gamma delta T cells.
VlnPlot(pbmc, features = c("CD4", "CD8A", "CCR7", "S100A4", "FOXP3", "TRDC"), idents = c(1,2,3,9,11,12,14,15))
To visualize the distribution of the main cell types, the expression level of relevant marker are shown in feature plots (for both UMAP and tSNE).
FeaturePlot(pbmc, features = c("CD3D", "NCAM1", "CD19", "CD14", "FCGR3A", "CD1C", "LILRA4", "CD34", "PPBP"))
FeaturePlot(pbmc, features = c("CD3D", "NCAM1", "CD19", "CD14", "FCGR3A", "CD1C", "LILRA4", "CD34", "PPBP"), reduction = "tsne")
Expression levels of the ten most significant genes for each cluster are also visualized as a heatmap. Similarities between clusters supports those clusters belonging to the same cell type, e.g. 0, 4, 5, 10, 12 for Monocytes and 1, 2, 3, 9, 11 for T cells.
pbmc.markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
Finally, UMAP and tSNE plots were annotated with the previously determined cluster identities. As expected, cells of the same type are generally clustered closely together.
new.cluster.ids <- c("CD14+ Monocytes",
"CD8+ naive T cells",
"CD4+ memory T cells",
"CD4+ naive T cells",
"CD14+ Monocytes",
"Intermediate Monocytes",
"Memory B cells",
"Naive B cells",
"NK cells",
"CD8+ effector memory T cells",
"CD16+ Monocytes",
"CD8+ central memory T cells",
"Unknown, likely Monocytes",
"Myeloid dendritic cells",
"Gamma delta T cells",
"CD4+ regulatory T cells",
"Plasmocytoid dentritic cells",
"Intermediate B cells",
"Platelets",
"Plasma cells",
"Hematopoietic stem cells")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
dimplot1 <- DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5, repel = TRUE, label.box = TRUE) + NoLegend()
dimplot1$layers[[3]] <- unserialize(serialize(dimplot1$layers[[2]], NULL))
dimplot1$layers[[2]]$geom_params$seed <- 1234
dimplot1$layers[[3]]$geom_params$seed <- 1234
dimplot1$layers[[2]]$aes_params$alpha <- 0.5
dimplot1$layers[[3]]$aes_params$fill <- NA
dimplot1
dimplot2 <- DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = 0.5, repel = TRUE, label.box = TRUE) + NoLegend()
dimplot2$layers[[3]] <- unserialize(serialize(dimplot2$layers[[2]], NULL))
dimplot2$layers[[2]]$geom_params$seed <- 1234
dimplot2$layers[[3]]$geom_params$seed <- 1234
dimplot2$layers[[2]]$aes_params$alpha <- 0.5
dimplot2$layers[[3]]$aes_params$fill <- NA
dimplot2